from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, year, sum as sum_

# SparkSession is the modern single entry point, replacing SparkContext/SQLContext.
spark = SparkSession.builder.getOrCreate()

# Read the CSV with its header row; inferSchema makes stock_volume numeric
# so it can be summed later.
df1 = spark.read.csv(r'C:\Users\Wang\Desktop\data\stock_small.csv',
                     header=True, inferSchema=True)
# Spark DataFrames have no pandas-style .iloc; select columns by position instead.
df = df1.select([df1.columns[i] for i in [1, 2, 7]])

# Spark DataFrames are immutable: use withColumn rather than item assignment,
# and the year() function rather than pandas' .dt.year accessor. The column is
# named date_y (not date.y) because dots in column names need escaping in Spark.
df = df.withColumn('date', to_date(df['date']))
df = df.withColumn('date_y', year(df['date']))

# Total volume per stock per year; agg/alias replaces pandas' groupby().sum().
new_series = (df.groupBy('stock_symbol', 'date_y')
                .agg(sum_('stock_volume').alias('stock_volume')))

# orderBy replaces pandas' sort_values; show() prints the rows.
new_series = new_series.orderBy('stock_volume', ascending=False)
new_series.show()
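
# For reference, the same aggregation can be expressed as a Spark SQL query.
# This is a sketch under the assumption that the three selected columns are
# named stock_symbol, date, and stock_volume, as the code above implies.
df.createOrReplaceTempView('stocks')
spark.sql("""
    SELECT stock_symbol,
           year(date)        AS date_y,
           SUM(stock_volume) AS stock_volume
    FROM stocks
    GROUP BY stock_symbol, year(date)
    ORDER BY stock_volume DESC
""").show()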
